#import the created functions package
import functions
# Importing necessary packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import mode
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import MultiLabelBinarizer
import plotly.express as px
import plotly.graph_objects as go
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
%matplotlib inline
from mpl_toolkits import mplot3d
import missingno as msno ##to visualize the missig values
##load the data into a DataFrame
url = 'https://raw.githubusercontent.com/sundeepblue/movie_rating_prediction/master/movie_metadata.csv'
df = pd.read_csv(url, index_col=0)
##quick view of the dataset
df.head(3)
##get the number of rows and columns
display(df.shape)
# number of rows and columns present in the imdb movie ratings data
df_rows = df.shape[0]
df_columns = df.shape[1]
print('There are {} rows and {} columns in the imdb ratings dataset.'.format(df_rows,df_columns))
##column names
df.columns
### reset the index so that the column read in as the index (via index_col=0 above) can become a feature too
df2 = df.reset_index()
#quick check
df2.head(1)
df2.shape
#Number of unique movie titles
unique_val = df2.movie_title.nunique()
print('There are {} unique values in imdb ratings dataset.'.format(unique_val))
# Number of duplicate rows based on movie title
duplicates = df2.movie_title.duplicated().sum()
print('There are {} duplicate values in imdb ratings dataset.'.format(duplicates))
## Drop the duplicate rows based on movie_title (keeps the first occurrence)
df2 = df2.drop_duplicates(subset='movie_title')
##count of movies after dropping the duplicates
df2.shape[0]
df2.head(2)
#while inspecting through other sources, titles had a stray 'Â' (an encoding artifact) at the end,
#so strip surrounding whitespace and remove the artifact if present
df2.movie_title=df2.movie_title.str.strip()
df2.movie_title=df2.movie_title.replace('Â', '', regex=True)
df2.movie_title;
##count of non-null/NA values in each column
df2.count()
# Visualize completeness/missing values: the bar chart shows how complete each column is
# (higher bar = more complete data and fewer NULL values, and vice versa)
msno.bar(df2)
#using the missing_value function saved under the functions module;
#it takes a dataframe and produces a table and plot of the total nulls and percentage of nulls per column
functions.missing_value(df2)
We can see from the bar graph and the table above that 'gross' and 'budget' have the highest missing values, with 17.5% and 9.8% null values respectively. One approach would be to drop those columns, but their missing rates are not extremely high and they seem to be important features: the budget of a film suggests the quality of the actors and the movie to a certain extent, and the gross earning of a movie is generally high if viewers like it — both of which are generally reflected in IMDB scores. Therefore, it is in our interest to keep these features for further analysis. We will only delete rows with null values for 'gross' and 'budget', because imputation would not be a good approach here. We may revisit this later and create another model to replace these missing values if needed.
.
.
##remove rows with null values in features 'gross' and 'budget'
df3 = df2[df2['gross'].notna() & df2['budget'].notna()]
df3.shape
# FIX: renamed from `remaining_pct` — this expression computes the REMOVED share
# of rows, ((old - new) / old) * 100, which is exactly what the message reports
removed_pct = ((df2.shape[0]-df3.shape[0])/df2.shape[0])*100
print('We have removed {} percentage of values from the previous imdb ratings dataset.'.format(removed_pct))
We still have 3789 rows in our dataset, which is a good data size.
Further data cleaning
##inspecting genres: each movie stores its genres as one pipe-separated string
df3.genres.head(3)
The genres column contains multiple genres for the same movie. Intuitively, genre may be an important factor for the IMDB score rating, so we will first inspect whether that intuition holds. To do this, we will create a one-hot-encoded table for all genres.
#creating a genre working frame
#FIX: take an explicit copy so the column assignment below operates on an
#independent frame instead of a view of df3 (avoids SettingWithCopyWarning)
genre_temp = df3[['genres','imdb_score']].copy()
##get the list of unique genres
genrelist = genre_temp.genres.str.split('|') ##split the pipe-separated genre strings
flattened_list = [i for x in genrelist for i in x] ##flatten the list of lists
genres = list(set(flattened_list)) ##unique genre names
##replace each genre string with its list of genres
genre_temp.genres = genre_temp.genres.str.split('|')
#reset to a fresh 0..n-1 index so it lines up with the one-hot frame built below
genre_temp = genre_temp.reset_index(drop=True)
display(genre_temp.head(2)) ##quick view
genre_temp.shape ##row count must match df3
##one-hot encoding of the genre lists using sklearn's MultiLabelBinarizer
mlb = MultiLabelBinarizer()
genre_onehot = pd.DataFrame(mlb.fit_transform(genre_temp.genres), columns=mlb.classes_)
display(genre_onehot.head(2)) #quick view
genre_onehot.shape ##row count must match genre_temp
##per-genre score table: each movie's imdb_score lands in the columns of its genres,
##0 everywhere else
score_by_genre = genre_onehot.multiply(genre_temp['imdb_score'], axis='index')
display(score_by_genre.head(2))
score_by_genre.shape
#replace the 0s (movie NOT in that genre) with NaN so they don't drag the mean down
score_by_genre2 = score_by_genre.replace(0, np.NaN)
#mean imdb score for each genre
means = score_by_genre2.mean(axis=0)
##convert the series to a dataframe for plotly, turning the genre index into a column
means = means.to_frame().reset_index()
#rename the columns to descriptive names
means = means.rename(columns={'index': "Genre", 0: "Avg_IMDB_rating"})
means.head(2)
##plotly bar plot of average IMDB score per genre
fig = px.bar(means, x = "Genre", y = "Avg_IMDB_rating" )
fig.show()
We can see that most of the genres have similar scores, ranging between 6.2 and 7.2, so genre may not be a good feature for us as it will not provide much useful insight. We will therefore drop genre for the initial model and maybe revisit it later if needed.
##complete cases: rows of df3 with no NaN in any column
##(dropna() with default arguments removes exactly those rows)
complete_cases = len(df3.dropna())
print('We have {} complete cases i.e number of rows without any missing value/ NaNs in any of the columns'.format(complete_cases))
print('We still have {} rows with missing value in any of the columns'.format(df3.shape[0]-complete_cases))
.
#using the missing_value function saved under the functions module;
#it takes a dataframe and produces a table and plot of the total nulls and percentage of nulls per column
functions.missing_value(df3)
We can see that the feature 'aspect_ratio' has the highest number of missing values, 74, which constitutes 1.95% of the data. Before we impute the data, we want to inspect the feature.
##inspecting aspect_ratio: its distinct values and their frequencies
display(df3.aspect_ratio.unique())
df3.aspect_ratio.value_counts()
##mean imdb_score for the two dominant aspect ratios and for everything else
is_185 = df3.aspect_ratio == 1.85
is_235 = df3.aspect_ratio == 2.35
ar1 = df3.imdb_score[is_185].mean()   ##mean score, aspect_ratio == 1.85
ar2 = df3.imdb_score[is_235].mean()   ##mean score, aspect_ratio == 2.35
ar3 = df3.imdb_score[~is_185 & ~is_235].mean()   ##mean score, all remaining rows
print('The mean imdb_score for movies with aspect_ratio of 1.85 is {} '.format(ar1))
print('The mean imdb_score for movies with aspect_ratio of 2.35 is {} '.format(ar2))
print('The mean imdb_score for movies with aspect_ratio other than 1.85 and 2.35 is {}'.format(ar3))
From this we can see that the mean imdb_score for movies with any aspect ratio is similar, ranging from 6.3 to 6.6. Since there is not much difference in score across aspect ratios, the 'aspect_ratio' feature will not give us much information and can be dropped.
By inspecting feature 'genre', we concluded that we will remove genre
We will also remove 'movie_imdb_link' as it does not give any information for our analysis
##Remove the features judged uninformative above in a single drop:
##'genres' and 'aspect_ratio' (no score signal), and 'movie_imdb_link'
##(a URL carries no information for the analysis)
df4 = df3.drop(columns=['genres', 'aspect_ratio', 'movie_imdb_link'])
##inspect remaining columns, dtypes and non-null counts
df4.info()
##summary statistics of the numeric columns
df4.describe()
##count of null values in each column
df4.isnull().sum()
The rest of the data has relatively few missing values, and we can impute those missing values so that we do not lose any more data points.
##numeric columns that still contain missing values
##(single source of truth — reused below so the names cannot drift out of sync)
num_cols = ['num_critic_for_reviews','duration','actor_3_facebook_likes',
            'actor_1_facebook_likes','facenumber_in_poster','actor_2_facebook_likes']
df_numeric = df4[num_cols]
#count of missing values for each column
df_numeric.isna().sum()
#impute using MICE (sklearn's IterativeImputer implements MICE)
mice_imputer = IterativeImputer()
imputed = mice_imputer.fit_transform(df_numeric)  #returns a plain numpy array
##convert the imputed array back into a DataFrame with the original column names
df_imputed = pd.DataFrame(imputed, columns=num_cols)
#sanity check: imputation should leave 0 missing values in every column
df_imputed.isna().sum()
#rebuild df4 with a fresh 0..n-1 index so it aligns with df_imputed's index,
#then overwrite the missing entries in place with the imputed values
df5 = df4.reset_index(drop=True)
df5.update(df_imputed)
##count of missing data per column after the update
df5.isna().sum()
##distinct content_rating values (includes NaN for the missing ones)
df5.content_rating.unique()
##drop rows whose 'content_rating' is missing
##FIX: take an explicit .copy() so the recode assignments in the next cell
##operate on an independent frame instead of a view of df5
##(avoids SettingWithCopyWarning / potentially silent no-op writes)
df6 = df5[df5['content_rating'].notna()].copy()
df6.shape
df6['content_rating'].value_counts()
According to the history of these content-rating systems: PG-13 = G, M = GP = PG, and X = NC-17. We want to replace PG-13 with G, replace M and GP with PG, and replace X with NC-17, because these are the labels used nowadays.
##recode historical content ratings to their modern equivalents
df6['content_rating'] = df6['content_rating'].replace('PG-13', 'G')
##FIX: per the note above, 'M' and 'GP' are historical names for PG (not G)
df6['content_rating'] = df6['content_rating'].replace(['M','GP'], 'PG')
df6['content_rating'] = df6['content_rating'].replace('X', 'NC-17')
##fold the rarely used / unrated labels into 'R'
df6['content_rating'] = df6['content_rating'].replace(['Approved','Passed','Unrated','Not Rated'], 'R')
##checking after cleaning the column
display(df6['content_rating'].value_counts())
##plotting the cleaned distribution
df6['content_rating'].value_counts().plot.bar()
#Adding new columns for the purpose of EDA
#profit column: gross earnings minus budget
df6['profit'] = df6.gross - df6.budget
#plotting the 'color' column
a=df6.color.value_counts()
display(a)
a.plot.bar()
#FIX: derive the percentages from the actual counts instead of hard-coding 3614/122,
#so the numbers stay correct if the upstream filtering changes.
#value_counts() sorts descending, so a.iloc[0] is the dominant 'Color' class.
color_p=(a.iloc[0]/a.sum())*100
bw_p=(a.iloc[1]/a.sum())*100
print('The color column in dataset consists of {} percent color and {} percent black&white'.format(color_p, bw_p))
Since the color column is extremely skewed towards color, this feature will not provide much help in our model. Thus, we will drop the color column.
#distribution of the 'language' column
lang = df6['language'].value_counts()
display(lang)
lang.plot.bar()
We can see that the 'language' column is also extremely skewed: almost all the movies in the dataset have English as the language. This feature will not provide much help in our model either, so we will drop the language column as well.
##Removing the 'language' and 'color' features
##(both are extremely skewed, as shown above, so they carry little signal)
df7 = df6.drop(columns=['color','language'])
df7.shape
df7.columns;
#plotting the 'country' column
c= df7.country.value_counts()
display(c)
c.plot.bar()
We can see that the majority of the movies are from the USA, with the UK second, followed by France, Germany and Canada. Since all the other countries have fewer than 50 movies each, we will group them all as 'Other'.
##group every country outside the five most frequent ones into 'Other'
major_countries = ['USA','UK','France','Germany','Canada']
df7["country"] = df7["country"].where(df7["country"].isin(major_countries), "Other")
#plotting the 'country' column after grouping
cc = df7.country.value_counts()
display(cc)
cc.plot.bar()
## Histogram of movie release years (graph_objects version;
## the commented plotly-express attempt below is kept for reference)
# fig = px.histogram(df7, x="title_year")
# fig.update_layout(
# title_text='Movies release over the years', # title of plot
# xaxis_title_text='title_year', # xaxis label
# yaxis_title_text='Count', # yaxis label
# bargap=0.1, # gap between bars of adjacent location coordinates
# #bargroupgap=0.1 # gap between bars of the same location coordinates
# )
# fig.show()
fig = go.Figure(data=[go.Histogram(x=df7.title_year)])
fig.update_layout(
title={
'text': "Movies release over the years",
'y':0.9,
'x':0.5,
'xanchor': 'center',
'yanchor': 'top'},
xaxis_title_text='Year', # xaxis label
yaxis_title_text='Count', # yaxis label
bargap=0.2, # gap between bars of adjacent location coordinates
)
fig.update_traces(marker_color='rgb(128,0,128)',opacity=0.4)
fig.show()
##keep only movies released on or after 1980
df8 = df7[df7['title_year'] >= 1980]
df8.columns;
##top 15 movies by profit
ccc = df8.nlargest(15, ['profit'])
##plotly bar plot
##FIX: px.bar's `labels` parameter must be a dict mapping column -> display
##name; the original passed a set literal, which is not a valid value
fig = px.bar(ccc, x='movie_title', y='profit',
             hover_data=['director_name', 'title_year', 'budget', 'gross','actor_1_name'],
             labels={'movie_title': 'Movie title', 'profit': 'profit'}, height=600)
fig.update_layout(
    title={
        'text': "Movies by profit",
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    xaxis_title_text='Movie title', # xaxis label
    yaxis_title_text='profit', # yaxis label
)
fig.update_traces(marker_color='rgb(128,128,128)',opacity=0.75)
display(ccc[['movie_title','title_year']])
fig.show()
##top 15 movies by gross earning
ccc = df8.nlargest(15, ['gross'])
##plotly bar plot
##FIX: px.bar's `labels` parameter must be a dict mapping column -> display
##name; the original passed a set literal, which is not a valid value
fig = px.bar(ccc, x='movie_title', y='gross',
             hover_data=['director_name','title_year', 'budget', 'gross','actor_1_name'],
             labels={'movie_title': 'Movie title', 'gross': 'Gross Earning'}, height=650)
fig.update_layout(
    title={
        'text': "Movies by Gross earning",
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    xaxis_title_text='Movie title', # xaxis label
    yaxis_title_text='Gross Earning', # yaxis label
)
fig.update_traces(marker_color='rgb(66,12,9)',opacity=0.75)
display(ccc[['movie_title','gross']])
fig.show()
##top 15 movies by budget
ccc = df8.nlargest(15, ['budget'])
##plotly bar plot
##FIX: px.bar's `labels` parameter must be a dict mapping column -> display
##name; the original passed a set literal, which is not a valid value
fig = px.bar(ccc, x='movie_title', y='budget',
             hover_data=['director_name', 'budget', 'gross','actor_1_name'],
             labels={'movie_title': 'Movie title', 'budget': 'Movie budget'}, height=650)
fig.update_layout(
    title={
        'text': "Movies by Budget",
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    xaxis_title_text='Movie title', # xaxis label
    yaxis_title_text='Movie budget', # yaxis label
)
fig.update_traces(marker_color='rgb(106,13,73)',opacity=0.75)
display(ccc[['movie_title','budget']])
fig.show()
## Relation between highest score, budget and country
## FIX: use dedicated names instead of rebinding `df`, which clobbered the
## raw dataset loaded at the top of the notebook
top35 = df8.nlargest(35, ['imdb_score'])
fig = px.scatter(top35, x="imdb_score", y="budget", facet_col="country", color="movie_title")
fig.show()
## Relation between highest score, gross earning and country
fig = px.scatter(top35, x="imdb_score", y="gross", facet_col="country", color="movie_title")
fig.show()
## gross earning vs score for the 200 highest-rated movies
top200 = df8.nlargest(200, ['imdb_score'])
fig = px.scatter(top200, x="imdb_score", y="gross", color="movie_title")
fig.show()
## gross earning vs score for the 300 lowest-rated movies
bottom300 = df8.nsmallest(300, ['imdb_score'])
fig = px.scatter(bottom300, x="imdb_score", y="gross", color="movie_title")
fig.show()
## facebook likes vs score for the whole dataset, coloured by content rating
fig = px.scatter(df8, x="imdb_score", y="movie_facebook_likes", color="content_rating")
fig.show()
##scatter of the 200 most profitable movies: gross vs score, sized by profit
##FIX: use a dedicated name instead of rebinding `df` (which clobbered the raw dataset)
profit_top = df8.nlargest(200, ['profit'])
fig = px.scatter(profit_top, x="imdb_score", y="gross", size="profit", color="content_rating", hover_data=["movie_title"])
#vertical reference line at imdb_score == 7
fig.add_shape(
    type="line",
    x0=7,
    y0=0,
    x1=7,
    y1=900000000,
    line=dict(
        color="LightSeaGreen",
        width=4,
        dash="dashdot",
    ),
)
#horizontal reference line at gross == 400,000,000
fig.add_shape(
    type="line",
    x0=4,
    y0=400000000,
    x1=9.5,
    y1=400000000,
    line=dict(
        color="LightSeaGreen",
        width=4,
        dash="dashdot",
    ),
)
fig.show()
##number of unique directors
df8.director_name.nunique()
##unique actor names in each actor slot
df8[['actor_1_name','actor_2_name','actor_3_name']].nunique()
## unique (actor_1, actor_2, actor_3) combinations
df8.groupby(['actor_1_name','actor_2_name','actor_3_name']).ngroups
##drop the high-cardinality name/keyword columns listed above
df9 = df8.drop(columns=['director_name', 'actor_1_name','actor_2_name','actor_3_name','plot_keywords'])
##drop 'profit' (it was derived from gross and budget earlier, which remain as features)
df9 = df9.drop(columns=['profit'])
##Checking if there are any missing values left
df9.isna().sum()
# Rescale the font size of the seaborn plots
sns.set(font_scale=1)
# Calculate the correlation between all the variables
c = df9.corr()
# Use heatmap to see the correlation between all the variables (including target variables)
plt.figure(figsize=(30,15))
sns.heatmap(c, annot=True)
df9.columns
# ratio of critic reviews to user reviews
# NOTE(review): this yields inf if num_user_for_reviews is 0 — confirm the column is always positive
df9['user_critic_ratio'] = df9['num_critic_for_reviews'] / df9['num_user_for_reviews']
# drop the title plus the raw review-count and per-actor likes columns
df10 = df9.drop(columns=['movie_title','num_critic_for_reviews','num_user_for_reviews','actor_1_facebook_likes','actor_2_facebook_likes','actor_3_facebook_likes'])
df10.content_rating.value_counts()
# Convert 'country' to one-hot columns
df10['USA'] = [1 if i == 'USA' else 0 for i in df10.country]
df10['UK'] = [1 if i == 'UK' else 0 for i in df10.country]
df10['France'] = [1 if i == 'France' else 0 for i in df10.country]
df10['Germany'] = [1 if i == 'Germany' else 0 for i in df10.country]
df10['Canada'] = [1 if i == 'Canada' else 0 for i in df10.country]
df10['Other'] = [1 if i == 'Other' else 0 for i in df10.country]
# Convert 'content_rating' to one-hot columns
# FIX: these four previously iterated df10.country, so the R/G/PG/NC-17
# columns were always all zeros and the rating information was lost
df10['R'] = [1 if i == 'R' else 0 for i in df10.content_rating]
df10['G'] = [1 if i == 'G' else 0 for i in df10.content_rating]
df10['PG'] = [1 if i == 'PG' else 0 for i in df10.content_rating]
df10['NC-17'] = [1 if i == 'NC-17' else 0 for i in df10.content_rating]
# drop the original categorical columns now that they are encoded
df10 = df10.drop(columns=['country','content_rating'])
# Calculate the correlation between all the variables (now including the one-hot columns)
c = df10.corr()
# Use heatmap to see the correlation between all the variables (including target variables)
plt.figure(figsize=(30,15))
sns.heatmap(c, annot=True)
# Rescale the font size of seaborn plots
sns.set(font_scale=2)
# Use pairplot to see how the data is distributed with each other (disabled: slow with this many columns)
#sns.pairplot(df10[df10.columns.to_list()])
# dfx.columns
#df10.to_csv('df10.csv', index = False)
#dfx = pd.read_csv('df10.csv')
# Make a copy of the final cleaned dataset for modelling
dfx = df10.copy()
# Import model from scikit learn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score
dfx.head(3)
print(dfx.columns.tolist())
# fix the column order so that imdb_score (the prediction target) is the last column
dfx = dfx[['duration', 'gross', 'budget', 'num_voted_users', 'facenumber_in_poster','director_facebook_likes', 'cast_total_facebook_likes', 'movie_facebook_likes', 'user_critic_ratio','USA', 'UK', 'France', 'Germany', 'Canada', 'Other','R', 'G', 'PG', 'NC-17','imdb_score']]
dfx.head(3)
# Min-max normalization helper
def normalization(df):
    """Min-max scale a pandas Series/DataFrame to [0, 1]; returns a numpy array."""
    vals = df.values
    low = vals.min()
    return (vals - low) / (vals.max() - low)
# min-max scale every continuous feature column in place
for _col in ('duration', 'gross', 'budget', 'num_voted_users',
             'facenumber_in_poster', 'director_facebook_likes',
             'cast_total_facebook_likes', 'movie_facebook_likes',
             'user_critic_ratio'):
    dfx[_col] = normalization(dfx[_col])
dfx.imdb_score.head(23)
# Separate independent variables (all feature columns) from the dependent
# variable (the last column, imdb_score)
X = dfx.iloc[:, :-1]
y = dfx.iloc[:, -1]
# 80/20 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
import xgboost as xgb
from sklearn.metrics import mean_squared_error
data_dmatrix = xgb.DMatrix(data=X,label=y)
# XGBoost regressor (hyperparameters not tuned)
# FIX: 'reg:linear' is deprecated and only aliases 'reg:squarederror';
# use the current objective name to avoid the deprecation warning
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3, learning_rate=0.3,
                          max_depth=100, alpha=15, n_estimators=500)
xg_reg.fit(X_train,y_train)
preds = xg_reg.predict(X_test)
# RMSE and R^2 on the held-out test set
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))
r2_score(y_test, preds)
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
# Initialize a Random Forest Model (hyperparameters are not fine-tuned)
rf = RandomForestRegressor(n_estimators=500, max_depth=100, min_samples_leaf=5)
# Train the Model
rf.fit(X_train, y_train)
# Predict results on the test dataset
y_hat1 = rf.predict(X_test)
# Root-mean-squared error on the test set
rmse = np.sqrt(mse(y_test, y_hat1))
rmse
# R^2 of the random-forest predictions
r2_score(y_test, y_hat1)
#import required packages
from sklearn import neighbors
from sklearn.metrics import mean_squared_error
from math import sqrt
import matplotlib.pyplot as plt
%matplotlib inline
rmse_val = [] #to store rmse values for different k
for K in range(20):
K = K+1
model = neighbors.KNeighborsRegressor(n_neighbors = K)
model.fit(X_train, y_train) #fit the model
pred=model.predict(X_test) #make prediction on test set
error = sqrt(mean_squared_error(y_test,pred)) #calculate rmse
rmse_val.append(error) #store rmse values
print('RMSE value for k= ' , K , 'is:', error)
#plotting the rmse values against k values
curve = pd.DataFrame(rmse_val) #elbow curve
curve.plot()
from sklearn.model_selection import GridSearchCV
# grid-search the number of neighbors over 1..13 with 5-fold cross validation
params = {'n_neighbors':[1,2,3,4,5,6,7,8,9,10,11,12,13]}
knn = neighbors.KNeighborsRegressor()
model = GridSearchCV(knn, params, cv=5)
model.fit(X_train,y_train)
model.best_params_
# FIX: refit with the k the grid search actually selected instead of
# hard-coding 12, and print that k — the old code printed the leftover
# loop variable K (always 20) from the elbow-curve cell above
best_k = model.best_params_['n_neighbors']
model = neighbors.KNeighborsRegressor(n_neighbors=best_k)
model.fit(X_train, y_train) #fit the model
pred = model.predict(X_test) #make prediction on test set
error = sqrt(mean_squared_error(y_test,pred)) #calculate rmse
print('RMSE value for k= ' , best_k , 'is:', error)
r2_score(y_test, pred)
X_train.shape
import numpy as np
import pandas as pd
import keras
import keras.backend as kb
import tensorflow as tf
# simple feed-forward regression network
# FIX: derive the input dimension from the training data instead of
# hard-coding 19, so the model keeps working if the feature set changes
model = keras.Sequential([
    keras.layers.Dense(128, activation=tf.nn.relu, input_shape=[X_train.shape[1]]),
    keras.layers.Dense(64, activation=tf.nn.relu),
    keras.layers.Dense(32, activation=tf.nn.relu),
    keras.layers.Dense(1)  # single linear output: the predicted imdb_score
])
optimizer = tf.keras.optimizers.Adam(0.0099)
model.compile(loss='mean_squared_error',optimizer=optimizer)
model.fit(X_train,y_train,epochs=500)
pred = model.predict(X_test)
error = sqrt(mean_squared_error(y_test,pred)) #calculate rmse
print('RMSE is:', error)
r2_score(y_test, pred)